This is an R Markdown document containing code and plots for Abumazen et al. “Nasopharyngeal metatranscriptomics reveals host-pathogen signatures of pediatric sinusitis”.

Loading required R packages

library(reshape2)
library(pheatmap)
library(pROC)
library(RColorBrewer)
library(dplyr)
library(plyr)
library(vioplot)
library(Metrics)
library(DESeq2)
library(tximport)
library(EnhancedVolcano)
library(patchwork)
library(knitr)

1. Kraken2-based pathogen detection from RNA-seq data

Loading data files


Reading in the kraken2 data

tb = read.delim("d1-d5_fastp_dehost_krakenmerged_reads_normalized.txt",sep='\t',col.names=(c("X","Y","Z")))

matr = dcast(tb,Y ~ X,value.var = "Z")
rownames(matr) = matr[,1]
matr = matr[,-1]

for (i in 1:nrow(matr))
{   matr[i,which(is.na(matr[i,]))] = 0
}

matr = t(data.matrix(matr))

splitrownames = vector(length = nrow(matr))
for (i in 1:length(splitrownames))
{
    rownames(matr)[i] = strsplit(rownames(matr),"_")[[i]][1]
}

Reading in the metadata

metadata = read.csv("metadata.12.20.updated.csv",header=T)
metadata = metadata[which(metadata$batch != 1),] #removing samples from batch 1

shared = intersect(rownames(matr),metadata[,1])
matr = matr[match(shared,rownames(matr)),]
metadata = metadata[match(shared,metadata[,1]),]

Bacterial predictions


Heatmap

annotation_row = metadata[,c(22,24,26)]
annotation_row = (annotation_row -2) * -1   #just to make it going 0 to 1
rownames(annotation_row) = metadata[,1]
colnames(annotation_row) = c("SPN", "HFLU", "MCAT")


cols = c("white",colorRampPalette((brewer.pal(n = 7, name =
  "YlGnBu")))(99))

ann_colors = list(
    SPN = c("white", "light gray"),
    HFLU = c("white", "light gray"),
    MCAT = c("white", "light gray")
)


pheatmap(log(matr[,c(grep("Moraxella cat", colnames(matr)),grep("Streptococcus pne", colnames(matr)),grep("Haemophilus inf", colnames(matr)))] + 1, 10),annotation_row=annotation_row[,c(2,1,3)],cluster_cols = F, show_rownames = FALSE,col=cols,annotation_colors = ann_colors)

Box Plots

  group = metadata[,"mcat"]
    group <- mapvalues(group, 
          from=c("2","1"), 
          to=c("-","+"))
    group = as.factor(group)

    abund = log(matr[,"Moraxella catarrhalis"] + 1,10)
  p = ggplot(data.frame(mcat = abund), aes(y = abund, x = group,fill=as.factor(group))) +
  geom_boxplot(outlier.shape=NA) +
  theme(legend.position = "none") +
    ggtitle("mcat") +    theme(plot.title = element_text(hjust = 0.5)) +
  labs(y = "RNA-seq abundance\nlog10(rpm)", x = "Culture test") +
  geom_point(pch = 21, position = position_jitterdodge())

  p

  group = metadata[,"spn"]
        group <- mapvalues(group, 
          from=c("2","1"), 
          to=c("-","+"))
    group = as.factor(group)

    abund = log(matr[,"Streptococcus pneumoniae"] + 1,10)
    
  p = ggplot(data.frame(spn = abund), aes(y = abund, x = group,fill=as.factor(group))) +
  geom_boxplot(outlier.shape=NA) +
  theme(legend.position = "none") +
    ggtitle("spn") +     theme(plot.title = element_text(hjust = 0.5)) +
  labs(y = "RNA-seq abundance\nlog10(rpm)", x = "Culture test") + 
  geom_point(pch = 21, position = position_jitterdodge())

  p

  group = metadata[,"hflu"]
        group <- mapvalues(group, 
          from=c("2","1"), 
          to=c("-","+"))
    group = as.factor(group)

    abund = log(matr[,"Haemophilus influenzae"] + 1,10)
     
  p = ggplot(data.frame(hflu = abund), aes(y = abund, x = group,fill=as.factor(group))) +
  geom_boxplot(outlier.shape=NA) +
  theme(legend.position = "none") +
    ggtitle("hflu") +    theme(plot.title = element_text(hjust = 0.5)) +
  labs(y = "RNA-seq abundance\nlog10(rpm)", x = "Culture test") +
  geom_point(pch = 21, position = position_jitterdodge())

  p

M. catarrhalis prediction accuracy

bacterial_detection_threshold = 3 # gives a good balance of sensitivity and specificity

# number of HFLU detected

hflu = which(matr[,"Haemophilus influenzae"] >= bacterial_detection_threshold)
length(hflu)
## [1] 81
# number of SPN detected
spn = which(matr[,"Streptococcus pneumoniae"] >= bacterial_detection_threshold)
length(spn)
## [1] 73
# number of MCAT detected
mcat = which(matr[,"Moraxella catarrhalis"] >= bacterial_detection_threshold)
length(mcat)
## [1] 137
#any of the above three
length(unique(c(hflu,spn,mcat)))
## [1] 177
#two or more
length(unique(c(intersect(hflu,mcat),intersect(hflu,spn),intersect(spn,mcat))))
## [1] 89
#all three
length(intersect(intersect(hflu,mcat),spn))
## [1] 25
values = unique(rev(sort(matr[,"Moraxella catarrhalis"])))
TPR = vector(length = length(values)) 
FPR = vector(length = length(values)) 
for (i in 1:length(values))

{ k = values[i] 
    matches = which(matr[,"Moraxella catarrhalis"] >= k)
    TPR[i] = length(intersect(matches,which(metadata[,"mcat"] == 1))) / length(which(metadata[,"mcat"] == 1))
    FPR[i] = length(intersect(matches,which(metadata[,"mcat"] == 2))) / length(which(metadata[,"mcat"] == 2)) 
} 


prediction = matr[,"Moraxella catarrhalis"]
response = metadata[,"mcat"]
response = (response -2) * -1
auc(response,prediction)
## [1] 0.823844
auc_mcat = auc(response,prediction)

plot(FPR,TPR,type="l",main=paste("mcat","AUC = ",round(auc_mcat,2)))

matches = which(matr[,"Moraxella catarrhalis"] >= bacterial_detection_threshold)
tpr0 = length(intersect(matches,which(metadata[,"mcat"] == 1))) / length(which(metadata[,"mcat"] == 1))
fpr0 = length(intersect(matches,which(metadata[,"mcat"] == 2))) / length(which(metadata[,"mcat"] == 2)) 
points(x = fpr0, y = tpr0,cex=2)

# prints out the sensitivity and the specificity given the threshold defined above
print(c("mcat",tpr0 * 100,100-(fpr0 * 100)))
## [1] "mcat"             "84.7457627118644" "64.0776699029126"
sens_mcat = tpr0 * 100
spec_mcat = 100-(fpr0 * 100)

S. pneumoniae prediction accuracy

values = unique(rev(sort(matr[,"Streptococcus pneumoniae"])))
TPR = vector(length = length(values)) 
FPR = vector(length = length(values)) 
for (i in 1:length(values))

{ k = values[i] 
    matches = which(matr[,"Streptococcus pneumoniae"] >= k)
    TPR[i] = length(intersect(matches,which(metadata[,"spn"] == 1))) / length(which(metadata[,"spn"] == 1))
    FPR[i] = length(intersect(matches,which(metadata[,"spn"] == 2))) / length(which(metadata[,"spn"] == 2)) 
} 

prediction = matr[,"Streptococcus pneumoniae"]
response = metadata[,"spn"]
response = (response -2) * -1
auc(response,prediction)
## [1] 0.8915326
auc_spn = auc(response,prediction)

plot(FPR,TPR,type="l",main=paste("spn","AUC = ",round(auc_spn,2)))

matches = which(matr[,"Streptococcus pneumoniae"] >= bacterial_detection_threshold)
tpr0 = length(intersect(matches,which(metadata[,"spn"] == 1))) / length(which(metadata[,"spn"] == 1))
fpr0 = length(intersect(matches,which(metadata[,"spn"] == 2))) / length(which(metadata[,"spn"] == 2)) 
points(x = fpr0, y = tpr0,cex=2)

# prints out the sensitivity and the specificity given the threshold defined above
print(c("spn",tpr0 * 100,100-(fpr0 * 100)))
## [1] "spn"              "81.4285714285714" "89.4039735099338"
sens_spn = tpr0 * 100
spec_spn = 100-(fpr0 * 100)

H. influenzae prediction accuracy

values = unique(rev(sort(matr[,"Haemophilus influenzae"])))
TPR = vector(length = length(values)) 
FPR = vector(length = length(values)) 
for (i in 1:length(values))

{ k = values[i] 
    matches = which(matr[,"Haemophilus influenzae"] >= k)
    TPR[i] = length(intersect(matches,which(metadata[,"hflu"] == 1))) / length(which(metadata[,"hflu"] == 1))
    FPR[i] = length(intersect(matches,which(metadata[,"hflu"] == 2))) / length(which(metadata[,"hflu"] == 2)) 
} 

prediction = matr[,"Haemophilus influenzae"]
response = metadata[,"hflu"]
response = (response -2) * -1
auc(response,prediction)
## [1] 0.9549669
auc_hflu = auc(response,prediction)

plot(FPR,TPR,type="l",main=paste("hflu","AUC = ",round(auc_hflu,2)))

matches = which(matr[,"Haemophilus influenzae"] > bacterial_detection_threshold)
tpr0 = length(intersect(matches,which(metadata[,"hflu"] == 1))) / length(which(metadata[,"hflu"] == 1))
fpr0 = length(intersect(matches,which(metadata[,"hflu"] == 2))) / length(which(metadata[,"hflu"] == 2)) 
points(x = fpr0, y = tpr0,cex=2)

# prints out the sensitivity and the specificity given the threshold defined above
print(c("hflu",tpr0 * 100,100-(fpr0 * 100)))
## [1] "hflu"             "94.2857142857143" "90.0662251655629"
sens_hflu = tpr0 * 100
spec_hflu = 100-(fpr0 * 100)


accuracies = cbind(sens = c(mcat = sens_mcat,spn = sens_spn,hflu = sens_hflu),spec = c(mcat = spec_mcat,spec_spn,spec_hflu),auc = c(auc_mcat, auc_spn, auc_hflu))

kable(accuracies)
sens spec auc
mcat 84.74576 64.07767 0.8238440
spn 81.42857 89.40397 0.8915326
hflu 94.28571 90.06623 0.9549669

Viral predictions


Heatmap

viral_reads = matrix(ncol=12,nrow = nrow(matr))

colnames(viral_reads) = c("INFA","INFB","INFC","MPV","RSV","HRV","PIV1","PIV2","PIV3","PIV4","ADV","EVD68")

rownames(viral_reads) = rownames(matr)

viral_reads[,"INFA"] = matr[,"Influenza A virus"]

viral_reads[,"INFB"] = matr[,"Influenza B virus"]

viral_reads[,"INFC"] = matr[,"Influenza C virus"]

viral_reads[,"MPV"] = matr[,"Human metapneumovirus"]

viral_reads[,"RSV"] = matr[,"Respiratory syncytial virus"] + matr[,"Human orthopneumovirus"]

viral_reads[,"HRV"] = matr[,"Rhinovirus A"] + matr[,"Rhinovirus B"] + matr[,"Rhinovirus C"]

viral_reads[,"PIV1"] = matr[,"Human respirovirus 1"]

viral_reads[,"PIV2"] = matr[,"Human orthorubulavirus 2"]

viral_reads[,"PIV3"] = matr[,"Human respirovirus 3"]

viral_reads[,"PIV4"] = matr[,"Human orthorubulavirus 4"]

viral_reads[,"ADV"] = matr[,"Human mastadenovirus B"] + matr[,"Human mastadenovirus C"]

viral_reads[,"EVD68"] = matr[,"Enterovirus D"]



#computing the number of samples containing 0, 1, 2, 3, etc. viruses
viruses_presenceAbsence = viral_reads
viruses_presenceAbsence[viruses_presenceAbsence != 0] = 1
table(apply(viruses_presenceAbsence,1,sum))
## 
##  0  1  2  3  4  5  6 
## 46 74 65 24  8  3  1
#frequency of viral detections
rev(sort(apply(viruses_presenceAbsence,2,sum)))
##   HRV   MPV  INFA  PIV4  PIV3   RSV EVD68  INFB   ADV  PIV1  INFC  PIV2 
##   100    32    29    28    28    25    23    16    15    15    11     7
tomap = log(viral_reads + 1, 10)

annotation = metadata[,c(34,32,59,57,55,53,49,63,51,46,40,36)]
rownames(annotation) = metadata[,1]
colnames(annotation) = rev(colnames(tomap))

cols = c("white",colorRampPalette((brewer.pal(n = 7, name =
  "YlGnBu")))(99))

for (i in 1:ncol(annotation))
{
    annotation[,i] = as.factor(findInterval(annotation[,i],c(10,20,30,40)))

}

ann_colors = list(
    INFA = c("4" = "#F2F2F2", "3" = "#E6E6E6", "2" = "dark gray", "1" = "black"),
    INFB = c("4" = "#F2F2F2", "3" = "#E6E6E6", "2" = "dark gray", "1" = "black"),
    INFC = c("4" = "#F2F2F2", "3" = "#E6E6E6", "2" = "dark gray", "1" = "black"),
    MPV = c("4" = "#F2F2F2", "3" = "#E6E6E6", "2" = "dark gray", "1" = "black"),
    RSV = c("4" = "#F2F2F2", "3" = "#E6E6E6", "2" = "dark gray", "1" = "black"),
    HRV = c("4" = "#F2F2F2", "3" = "#E6E6E6", "2" = "dark gray", "1" = "black"),
    PIV1 = c("4" = "#F2F2F2", "3" = "#E6E6E6", "2" = "dark gray", "1" = "black"),
    PIV2 = c("4" = "#F2F2F2", "3" = "#E6E6E6", "2" = "dark gray", "1" = "black"),
    PIV3 = c("4" = "#F2F2F2", "3" = "#E6E6E6", "2" = "dark gray", "1" = "black"),
    PIV4 = c("4" = "#F2F2F2", "3" = "#E6E6E6", "2" = "dark gray", "1" = "black"),
    ADV = c("4" = "#F2F2F2", "3" = "#E6E6E6", "2" = "dark gray", "1" = "black"),
    EVD68 = c("4" = "#F2F2F2", "3" = "#E6E6E6", "2" = "dark gray", "1" = "black")
)
pheatmap(tomap,
         annotation_row=annotation,
         annotation_colors = ann_colors,
         cluster_cols = F,
         show_rownames = F,
         col=cols
)

Box plots

viruses = c("INFA","INFB","INFC","MPV","RSV","HRV","PIV1","PIV2","PIV3","PIV4","ADV","EVD68")


ctthreshold = 35

annotation = metadata[,c(34,32,59,57,55,53,49,63,51,46,40,36)]
rownames(annotation) = metadata[,1]
colnames(annotation) = rev(colnames(tomap))

annotation[annotation >= ctthreshold] = NA

annotation[annotation < ctthreshold] = 1

annotation[is.na(annotation)] = 0

par(mfrow=c(4,3), mar=c(4, 4, 2.5, 1.5))

sens_scores = vector(length = 12) # for each virus
spec_scores = vector(length = 12) # for each virus

for (viralcount in 1:12)
{   
        virus = viruses[viralcount]

        values = unique(rev(sort(viral_reads[,virus])))
        TPR = vector(length = length(values)) 
        FPR = vector(length = length(values)) 
        for (i in 1:length(values))
        {   k = values[i] 
            matches = which(viral_reads[,virus] >= k)
            TPR[i] = length(intersect(matches,which(annotation[,virus] == 1))) / length(which(annotation[,virus] == 1))
            FPR[i] = length(intersect(matches,which(annotation[,virus] == 0))) / length(which(annotation[,virus] == 0)) 
        } 
        #plot(FPR,TPR,type="l",main=virus)

        boxplot(tomap[which(annotation[,virus] == 0),virus],tomap[which(annotation[,virus] == 1),virus],main=virus)

        matches = which(viral_reads[,virus] > 0)
        tpr0 = length(intersect(matches,which(annotation[,virus] == 1))) / length(which(annotation[,virus] == 1))
        fpr0 = length(intersect(matches,which(annotation[,virus] == 0))) / length(which(annotation[,virus] == 0)) 

        # prints out the sensitivity and the specificity
        #print(c(virus,tpr0 * 100,100-(fpr0 * 100)))

        sens_scores[viralcount] = as.numeric(tpr0 * 100)
        spec_scores[viralcount] = as.numeric(100 - (fpr0 * 100))
}

kable(cbind(virus = viruses,sens = sens_scores,spec = spec_scores))
virus sens spec
INFA 100 93.6585365853659
INFB 100 97.1563981042654
INFC 33.3333333333333 95.8139534883721
MPV 100 90.8653846153846
RSV 90 92.4170616113744
HRV 73.469387755102 77.2357723577236
PIV1 100 94.4954128440367
PIV2 100 98.6175115207373
PIV3 100 91.4691943127962
PIV4 90.9090909090909 91.4285714285714
ADV 44.4444444444444 96.551724137931
EVD68 100 90

Calculation of viral load and comparison with ct values

genomeSizes = c("INFA" = 13.63, "INFB" = 14.45, "INFC" = 12.9, "MPV" = 13.35, "RSV" = 15.222, "HRV" = 7.171333, "PIV1" = 15.6, "PIV2" = 15.646, "PIV3" = 15.462, "PIV4" = 17.052, "ADV" = 35.1435, "EVD68" = 7.367)

#check that names match
names(genomeSizes) == colnames(viral_reads)
##  [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
viral_reads_adjusted = viral_reads

for (i in 1:ncol(viral_reads_adjusted))
{
        viral_reads_adjusted[,i] = log((viral_reads[,i]/genomeSizes[i]) + 1, 10)
}

countsVsCT = 
rbind(

cbind(1,viral_reads_adjusted[,"INFA"],metadata[,"fluact"]),
cbind(2,viral_reads_adjusted[,"INFB"],metadata[,"flubct"]),
cbind(3,viral_reads_adjusted[,"INFC"],metadata[,"flucct"]),
cbind(4,viral_reads_adjusted[,"MPV"],metadata[,"MPV_Ct"]),
cbind(5,viral_reads_adjusted[,"RSV"],metadata[,"RSV_Ct"]),
cbind(6,viral_reads_adjusted[,"HRV"],metadata[,"HRV_Ct"]),
cbind(7,viral_reads_adjusted[,"PIV1"],metadata[,"PIV1.Ct"]),
cbind(8,viral_reads_adjusted[,"PIV2"],metadata[,"PIV2.Ct"]),
cbind(9,viral_reads_adjusted[,"PIV3"],metadata[,"PIV3.Ct"]),
cbind(10,viral_reads_adjusted[,"PIV4"],metadata[,"PIV4.Ct"]),
cbind(11,viral_reads_adjusted[,"ADV"],metadata[,"ADV_Ct"]),
cbind(12,viral_reads_adjusted[,"EVD68"],metadata[,"EV.D68_Ct"])
)


df = data.frame(x = 1/countsVsCT[,3], y = countsVsCT[,2], group = colnames(viral_reads)[countsVsCT[,1]])

p = ggplot(df, aes(x = x, y = y, color = group)) +
  geom_point() + geom_smooth(method=lm, se=FALSE,color="gray")

p

# Pearson correlation

cor.test(df$x,df$y,use = "complete")
## 
##  Pearson's product-moment correlation
## 
## data:  df$x and df$y
## t = 17.872, df = 252, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.6878744 0.7973643
## sample estimates:
##       cor 
## 0.7476573

Optional: exploring prediction accuracy across multiple CT thresholds

avg_sensitivity = vector(length = 45)
avg_specificity = vector(length = 45)


for (count in 1:45) #this is the ct threshold

{
    ctthreshold = count

    annotation = metadata[,c(34,32,59,57,55,53,49,63,51,46,40,36)]
    rownames(annotation) = metadata[,1]
    colnames(annotation) = rev(colnames(tomap))

    annotation[annotation >= ctthreshold] = NA

    annotation[annotation < ctthreshold] = 1

    annotation[is.na(annotation)] = 0

    par(mfrow=c(4,3))

    sens_scores = vector(length = 12) # for each virus
    spec_scores = vector(length = 12) # for each virus

    for (viralcount in 1:12)
    {   
        virus = viruses[viralcount]

        values = unique(rev(sort(viral_reads[,virus])))
        TPR = vector(length = length(values)) 
        FPR = vector(length = length(values)) 
        for (i in 1:length(values))
        {   k = values[i] 
            matches = which(viral_reads[,virus] >= k)
            TPR[i] = length(intersect(matches,which(annotation[,virus] == 1))) / length(which(annotation[,virus] == 1))
            FPR[i] = length(intersect(matches,which(annotation[,virus] == 0))) / length(which(annotation[,virus] == 0)) 
        } 
        plot(FPR,TPR,type="l",main=virus)

        matches = which(viral_reads[,virus] > 0)
        tpr0 = length(intersect(matches,which(annotation[,virus] == 1))) / length(which(annotation[,virus] == 1))
        fpr0 = length(intersect(matches,which(annotation[,virus] == 0))) / length(which(annotation[,virus] == 0)) 

        # prints out the sensitivity and the specificity
        print(c(virus,tpr0 * 100,100-(fpr0 * 100)))

        sens_scores[viralcount] = as.numeric(tpr0 * 100)
        spec_scores[viralcount] = as.numeric(100 - (fpr0 * 100))

    }

    print(paste(count,mean(sens_scores)))
    print(paste(count,mean(spec_scores)))

    avg_sensitivity[count] = mean(sens_scores)
    avg_specificity[count] = mean(spec_scores)

}

Calculation of total bacterial pathogen and viral abundance


Here, we will calculate the total relative abundance of the three bacterial sinusitis pathogens (HFLU, MCAT, SPN), as well as the total relative abundance of respiratory viruses. This is done by summing their log10(RPM) values.

viral_pathogens = c("Respiratory syncytial virus","Enterovirus D","Human coronavirus 229E","Human coronavirus HKU1","Betacoronavirus 1","Human coronavirus NL63","Human mastadenovirus B","Human mastadenovirus C","Human metapneumovirus","Human orthopneumovirus","Human orthorubulavirus 2","Human orthorubulavirus 4","Human respirovirus 1","Human respirovirus 3","Influenza A virus","Influenza B virus","Influenza C virus","Rhinovirus A","Rhinovirus B","Rhinovirus C")

panel_viruses = c("Respiratory syncytial virus","Enterovirus D","Human mastadenovirus B","Human mastadenovirus C","Human metapneumovirus","Human orthopneumovirus","Human orthorubulavirus 2","Human orthorubulavirus 4","Human respirovirus 1","Human respirovirus 3","Influenza A virus","Influenza B virus","Influenza C virus","Rhinovirus A","Rhinovirus B","Rhinovirus C")

three_bacteria = c("Haemophilus influenzae", "Moraxella catarrhalis", "Streptococcus pneumoniae")



bacPathAbundance = apply(matr[,three_bacteria],1,sum)  # sum of their individual RPMs

viralPathAbundance = apply(matr[,viral_pathogens],1,sum)   # sum of their individual RPMs


metadata = metadata %>% mutate(threebac_quantile = ntile(bacPathAbundance, 10), viral_quantile = ntile(viralPathAbundance, 10))


metadata$high_bac<- ifelse(metadata$threebac_quantile >=6, 1, 0)
metadata$high_viral<- ifelse(metadata$viral_quantile >=6, 1, 0)

metadata$high_pathogens <- ifelse(metadata$threebac_quantile >=6 & metadata$viral_quantile >=6,3, 
                           ifelse(metadata$threebac_quantile >=6, 2,
                           ifelse(metadata$viral_quantile >=6, 1, 0)))

Exploratory analysis of newly identified pathogens


To explore, we will first subset the data to all viruses (species with “virus” in their name) with greater than log10(rpm) +1 > 0.3 (roughly 1 read per million), as well as any species with log10+1 abundance greater than 3 since this threshold includes our three bacterial sinusitis pathogens (SPN, HFU, MCAT).

matr.subset = matr[,unique(c(which(apply(log(matr + 1,10),2,max) > 3),intersect(grep("virus",colnames(matr)),which(apply(log(matr + 1,10),2,max) > 0.3))))]

pathTable = rev(sort(apply(log(matr.subset + 1,10),2,max)))
cols = vector(length = length(pathTable))
cols[] = "red"
cols[which(names(pathTable) %in% c(three_bacteria,panel_viruses))] = "black"

par(mfrow=c(2,1))
barplot(pathTable,col=cols,las=3)

pheatmap(t(log(matr[,names(pathTable)] + 1,10)),cluster_rows=F,fontsize_row=5)

The above list of species was then investigated further using BLAST and literature searches to identify a smaller subset of pathogens of interest.

pathogensOfInterest = c("Influenza A virus", "Influenza C virus", "Haemophilus influenzae", "Human coronavirus NL63", "Influenza B virus", "Betacoronavirus 1", "Respiratory syncytial virus", "Human coronavirus HKU1", "Human respirovirus 3", "Human orthopneumovirus", "Moraxella nonliquefaciens", "Fusobacterium nucleatum", "Human mastadenovirus C", "Human metapneumovirus", "Moraxella catarrhalis", "Prevotella intermedia", "Metamycoplasma salivarium", "Streptococcus pneumoniae", "Pasteurella multocida", "Human orthorubulavirus 4", "Tannerella forsythia", "Corynebacterium pseudodiphtheriticum", "Moraxella osloensis", "Enterovirus B", "Mycoplasma pneumoniae", "Enterovirus A", "Human orthorubulavirus 2", "Chlamydia pneumoniae", "Treponema medium", "Human coronavirus 229E", "Rhinovirus C", "Enterovirus D", "Rhinovirus A", "Human respirovirus 1", "Murine leukemia virus", "Rhinovirus B", "Human mastadenovirus B", "Human gammaherpesvirus 4", "Human betaherpesvirus 5", "Mamastrovirus 9", "Cardiovirus B", "Betapolyomavirus quartihominis", "Parechovirus A")


matr.subset = matr[,pathogensOfInterest]

pathTable = rev(sort(apply(log(matr.subset + 1,10),2,max)))
cols = vector(length = length(pathTable))
cols[] = "red"
cols[which(names(pathTable) %in% c(three_bacteria,panel_viruses))] = "black"
par(mfrow=c(2,1))
barplot(pathTable,col=cols,las=3,cex.names=0.5)

#plot the pathogen-positive samples (basedon culture/PCR)
breaksList = seq(0, 5, by = 1)
cols = c("white",colorRampPalette(brewer.pal(n = 4, name="YlGnBu"))(4))

temp.matr = t(log(matr[which(apply(metadata[,c(27,64)],1,sum) != 0),names(pathTable)] + 1,10))

temp.matr[temp.matr > 5] = 5 # adds ceiling to data to make color range consistent with next plot

pheatmap(temp.matr,cluster_rows=F,fontsize_row=5,fontsize_col=3,zlim=c(0,5),color =rev(cols), breaks=breaksList)

#plot the pathogen-negative samples (basedon culture/PCR)
pheatmap(t(log(matr[which(apply(metadata[,c(27,64)],1,sum) == 0),names(pathTable)] + 1,10)),cluster_rows=F,fontsize_row=5,fontsize_col=3,zlim=c(0,5),color = rev(cols),breaks=breaksList)

2. Gene Expression Analyses


Loading data and formatting metadata

genesymbols = read.delim("host-expression/gencode.v39.metadata.HGNC")

metadata = metadata[which(metadata$batch != 1),] #removing samples from batch 1
metadata[,"batch"] = as.factor(metadata[,"batch"])

metadata[,"bv_both_none"] = as.factor(metadata[,"bv_both_none"])

metadata[metadata[,"cum_pathogn"] > 1,"cum_pathogn"] = 1

metadata[,"cum_pathogn"] = as.factor(metadata[,"cum_pathogn"])

metadata[,"sex"] = as.factor(metadata[,"sex"])

metadata$age_scaled = scale(metadata$Age.in.years)

DESEQ2 pairwise comparison of bacteria-only vs virus-only group

onlyvirusonlybacteria_samples = metadata[which((metadata$bv_both_none == 1) | (metadata$bv_both_none == 2)),]

onlyvirusonlybacteria_samples = onlyvirusonlybacteria_samples %>% arrange(cum_pathogn)

subsetfiles <- file.path("host-expression/quants", onlyvirusonlybacteria_samples$Filename, "quant.sf")
subset_txi.salmon <- tximport(subsetfiles, type = "salmon", tx2gene = genesymbols)

dds <- DESeqDataSetFromTximport(subset_txi.salmon, onlyvirusonlybacteria_samples, ~ batch + sex + age_scaled + cum_pathogn)

dds <- DESeq(dds)

res <- results(dds)

summary(res)
## 
## out of 31126 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 2944, 9.5%
## LFC < 0 (down)     : 1891, 6.1%
## outliers [1]       : 0, 0%
## low counts [2]     : 8333, 27%
## (mean count < 1)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
pairwise_genes = which(res$padj < 0.001)

bac_upDEGs = intersect(which(res$padj < 0.001),which(res$log2FoldChange > 0))

viral_upDEGs = intersect(which(res$padj < 0.001),which(res$log2FoldChange < 0))

Number of DEGs detected:

There are 548 bacterial upDEGs and 273 viral upDEGs.

Volcano Plot

EnhancedVolcano(res,
  lab = rownames(res),
  x = 'log2FoldChange',
  y = 'pvalue',
  pCutoff = 0.001,
  FCcutoff = 0,
  xlim = c(-7, 7),
    ylim=c(-1,27),
  pointSize = 1.5,
  labSize = 2.5,
  title = 'DESeq2 results',
  subtitle = 'Differential expression',
  caption = 'p-value cutoff, 0.001',
  legendPosition = "none",
  legendLabSize = 14,
  col = c('grey30', 'light gray', 'royalblue', 'red2'),
  colAlpha = 0.9,
  drawConnectors = FALSE,
  widthConnectors = 0.5)

Host-response / pathogen abundance correlations

Read in full set of files, including those with NO pathogens and BOTH (virus and bacteria)

files <- file.path("host-expression/quants", metadata$Filename, "quant.sf")
txi.salmon <- tximport(files, type = "salmon", tx2gene = genesymbols)
pdds <- DESeqDataSetFromTximport(txi.salmon, metadata, ~ batch + sex + age_scaled + cum_pathogn)
prld <- varianceStabilizingTransformation(pdds, blind = TRUE)

Jitter plots of expression levels for selected genes (previous biomarkers)

These are plots of host gene expression per clinical group

pathOrNot = as.numeric(metadata[,"bv_both_none"]) # using clinical categories

genelist = c("CXCL10", "PRF1", "IFI27", "CCL8", "PTGS2", "S100A9", "PLAUR", "TNFAIP3","IL1B")
for (i in 1:length(genelist)) {
  gene = genelist[i] 
  print(gene)
  p = ggplot(data.frame(cbind(assay(prld)[gene,],pathOrNot)), aes(y = V1, x = as.factor(pathOrNot),fill=as.factor(pathOrNot))) +
  geom_boxplot(outlier.shape=NA) +
  ggtitle(gene) +
 theme(axis.title = element_blank(), legend.position="none") + 
    labs(y = "RNA-seq abundance\nlog10(rpm)", x = "Culture test") +
  geom_point(pch = 21, position = position_jitterdodge()) +
  scale_fill_discrete(labels=c("None detected", "Bacterial", "Viral", "Both")) +
  labs(fill = "Infection type")
  assign(paste0("p", i), p)
}
## [1] "CXCL10"
## [1] "PRF1"
## [1] "IFI27"
## [1] "CCL8"
## [1] "PTGS2"
## [1] "S100A9"
## [1] "PLAUR"
## [1] "TNFAIP3"
## [1] "IL1B"
p1 + p2 + p3 + p4 + p5 + p6 + p7 + p8 + p9 + theme(legend.position="right") + plot_layout(nrow = 3)

Setting up data for sorted heatmaps

#setting up a temp matrix with Z-score scaled expression levels
temp = t(scale(t(assay(prld[,])))) 
temp[temp < -4] = -4
temp[temp > +4] = +4
temp[is.na(temp)] = 0

annotation_col = data.frame(
    Category = metadata$high_pathogens, 
            Viral_pathogen_abundance = metadata[,"viral_quantile"],
     Bacterial_pathogen_abundance = metadata[,"threebac_quantile"],
     Symptom_severity_score = metadata[,"PRSS"],
     Days_with_cold = metadata[,"days_with_cold"],
     Infection = factor(metadata[,"bv_both_none"], labels = c("None","Viral", "Bacterial", "Both"))
     )

Viral sorted heatmap

heatmap <- pheatmap(temp[viral_upDEGs,order(viralPathAbundance)], 

    annotation_colors = list(Infection = c(None = "white", Bacterial = "light blue", Viral = "gold", Both = "black"),

    Resistance = c("No" = "white", "Yes" = "black")),
    annotation_col = annotation_col,
    show_colnames = F,
    show_rownames = F,
    border_color=NA,
    col = colorRampPalette( rev(brewer.pal(9, "RdBu")) )(55),
    #col = colorRampPalette(c("navy", "white", "red"))(50),
    fontsize_row=6,
    cluster_cols = F,
    cluster_rows = T,
    annotation_legend = TRUE)

Bacterial sorted heatmap

heatmap <- pheatmap(temp[bac_upDEGs,order(bacPathAbundance)], 

annotation_colors = list(Infection = c(None = "white", Bacterial = "light blue", Viral = "gold", Both = "black"),
    Resistance = c("No" = "green", "Yes" = "red")),
    annotation_col = annotation_col,
    show_colnames = F,
    show_rownames = F,
    border_color=NA,
    col = colorRampPalette( rev(brewer.pal(9, "RdBu")) )(55),
    #col = colorRampPalette(c("navy", "white", "red"))(50),
    fontsize_row=6,
    cluster_cols = F,
    cluster_rows = T,
    annotation_legend = TRUE)

Combined heatmap

heatmap1 <- pheatmap(temp[viral_upDEGs,order(metadata$high_pathogen)], 

    annotation_colors = list(Infection = c(None = "white", Bacterial = "light blue", Viral = "gold", Both = "black"),
    Resistance = c("No" = "white", "Yes" = "black")),
    annotation_col = annotation_col,
    show_colnames = F,
    show_rownames = F,
    border_color=NA,
    col = colorRampPalette( rev(brewer.pal(9, "RdBu")) )(55),
    #col = colorRampPalette(c("navy", "white", "red"))(50),
    fontsize_row=6,
    cluster_cols = F,
    cluster_rows = T,
    annotation_legend = TRUE)

heatmap2 <- pheatmap(temp[bac_upDEGs,order(metadata$high_pathogen)], 

    annotation_colors = list(Infection = c(None = "white", Bacterial = "light blue", Viral = "gold", Both = "black"),
    Resistance = c("No" = "white", "Yes" = "black")),
    annotation_col = annotation_col,
    show_colnames = F,
    show_rownames = F,
    border_color=NA,
    col = colorRampPalette( rev(brewer.pal(9, "RdBu")) )(55),
    #col = colorRampPalette(c("navy", "white", "red"))(50),
    fontsize_row=6,
    cluster_cols = F,
    cluster_rows = T,
    annotation_legend = TRUE)

Jitter plots of host response scores vs pathogen abundance

temp = t(scale(t(assay(prld[,]))))

viralResponseScore = apply(temp[viral_upDEGs,],2,mean)

bacterialResponseScore = apply(temp[bac_upDEGs,],2,mean)


group = as.numeric(metadata[,"threebac_quantile"])

  p = ggplot(data.frame(cbind(bacterialResponseScore,group, row.names=NULL)), aes(y = bacterialResponseScore, x = as.factor(group),fill=as.factor(group))) +
  geom_boxplot(outlier.shape=NA) +
  geom_point(pch = 21, position = position_jitterdodge()) +
  labs(fill = "Pathogen abundance") +
  labs(x = "Bacterial pathogen abundance (decile)", y = "Bacterial response score\n(Mean expression Z-score of bacterial upDEGs)") +
    theme(legend.position = "none")
  
  p

group = as.numeric(metadata[,"viral_quantile"])

  p = ggplot(data.frame(cbind(viralResponseScore,group, row.names=NULL)), aes(y = viralResponseScore, x = as.factor(group),fill=as.factor(group))) +
  geom_boxplot(outlier.shape=NA) +
  geom_point(pch = 21, position = position_jitterdodge()) +
  labs(fill = "Pathogen abundance") +
  labs(x = "Viral pathogen abundance (decile)", y = "Viral response score\n(Mean expression Z-score of viral upDEGs)") +
    theme(legend.position = "none")
  p

group = as.numeric(metadata[,"high_pathogens"])

  p = ggplot(data.frame(cbind(bacterialResponseScore,group, row.names=NULL)), aes(y = bacterialResponseScore, x = as.factor(group),fill=as.factor(group))) +
  geom_boxplot(outlier.shape=NA) +
  geom_point(pch = 21, position = position_jitterdodge()) +
  scale_fill_discrete(labels=c("Both low", "Viral high", "Bacterial high", "Both high")) +
  labs(fill = "Pathogen abundance") +
  labs(x = "Pathogen abundance (decile)", y = "Bacterial response score\n(Mean expression Z-score of bacterial upDEGs)") 
  p

group = as.numeric(metadata[,"high_pathogens"])

  p = ggplot(data.frame(cbind(viralResponseScore,group, row.names=NULL)), aes(y = viralResponseScore, x = as.factor(group),fill=as.factor(group))) +
  geom_boxplot(outlier.shape=NA) +
  geom_point(pch = 21, position = position_jitterdodge()) +
  scale_fill_discrete(labels=c("Both low", "Viral high", "Bacterial high", "Both high")) +
  labs(fill = "Pathogen abundance") +
  labs(x = "Pathogen abundance (decile)", y = "Viral response score\n(Mean expression Z-score of viral upDEGs)")
  
  
  p

Individual gene expression levels across groups

pathOrNot = as.numeric(metadata[,"high_pathogens"]) # using categories "Both low" "Viral high" "Bacterial high" "Both high"

genelist = c("PTGS2", "S100A9", "PLAUR", "TNFAIP3", "SERPINA2", "CXCL2","IL1B", "CXCL11","IFNL1","CXCL10", "PRF1", "IFI27", "CCL8","OAS2")
for (i in 1:length(genelist)) {
  gene = genelist[i] 
  p = ggplot(data.frame(cbind(assay(prld)[gene,],pathOrNot)), aes(y = V1, x = as.factor(pathOrNot),fill=as.factor(pathOrNot))) +
  geom_boxplot(outlier.shape=NA) +
  ggtitle(gene) +
  theme(axis.title = element_blank(), legend.position="none") +
  geom_point(pch = 21, position = position_jitterdodge()) +
  scale_fill_discrete(labels=c("None detected", "Viral", "Bacterial", "Both")) +
  labs(fill = "Infection type")
  assign(paste0("p", i), p)
}
p1 + p2 + p3 + p4 + p5 + p6 + p7 + p8 + p9 + p10 + p11 + p12 + p13 + p14 + plot_layout(nrow = 2)

Testing the specificity of the bacterial host response for SPN/HFLU/MCAT

Here, we will compute the correlation between the bacterial host response score and the total bacterial pathogen abundance (MCAT + SPN + HFLU), and compare this to 1000 randomly samples selections of three bacterial species from the data.

abundant_bacteria = setdiff(which(apply(matr,2,max) >= 10),grep("virus",colnames(matr)))
cors = vector(length = 1000)
for (i in 1:1000)
{

        randSetOfThree = sample(abundant_bacteria,3)
        threeBacAbundance = apply(matr[,randSetOfThree],1,sum)  # sum of their individual RPMs
        #ntile(threeBacAbundance, 10)
        cors[i] = cor(bacterialResponseScore,log(threeBacAbundance + 1,10))
}

hist(cors,xlim=c(-1,1),breaks=100)
abline(v = cor(bacterialResponseScore, log(bacPathAbundance + 1,10)))

ROC curves for predicting high/low pathogen abundance based on host response

Viral ROC

response = vector(length = nrow(metadata))
response[] = 0
response[which(metadata$high_pathogen == 1 | metadata$high_pathogen == 3)] = 1

prediction = viralResponseScore
bacPlot = plot.roc(roc(response,prediction))

#coords(bacPlot) #for a list of sensitivities and specificities

auc(response,prediction)
## [1] 0.80819

Bacterial ROC

response = vector(length = nrow(metadata))
response[] = 0
response[which(metadata$high_pathogen == 2 | metadata$high_pathogen == 3)] = 1

prediction = bacterialResponseScore
bacPlot = plot.roc(roc(response,prediction))

#coords(bacPlot) #for a list of sensitivities and specificities

auc(response,prediction)
## [1] 0.7902539